// Source Code of org.terrier.structures.indexing.singlepass.hadoop.HadoopRunWriter

/*
 * Terrier - Terabyte Retriever
 * Webpage: http://terrier.org
 * Contact: terrier{a.}dcs.gla.ac.uk
 * University of Glasgow - School of Computing Science
 * http://www.gla.ac.uk
 *
 * The contents of this file are subject to the Mozilla Public License
 * Version 1.1 (the "License"); you may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS"
 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
 * the License for the specific language governing rights and limitations
 * under the License.
 *
 * The Original Code is HadoopRunWriter.java.
 *
 * The Original Code is Copyright (C) 2004-2011 the University of Glasgow.
 * All Rights Reserved.
 *
 * Contributor(s):
 *   Richard McCreadie <richardm{a.}dcs.gla.ac.uk> (original author)
 *   Craig Macdonald <craigm{a.}dcs.gla.ac.uk>
 *   
 */
package org.terrier.structures.indexing.singlepass.hadoop;


import java.io.IOException;


import org.apache.hadoop.mapred.OutputCollector;


import org.terrier.compression.MemorySBOS;
import org.terrier.structures.indexing.singlepass.Posting;
import org.terrier.structures.indexing.singlepass.RunWriter;


/** RunWriter for the MapReduce indexer. Provides functionality to write
 * term posting lists out to the map task outputcollector during a MapReduce
 * indexing job. Map and flush numbers are also passed with the posting list to
 * allow for docids to be corrected later from side-effect files.
 * @author Richard McCreadie and Craig Macdonald 
 **/
public class HadoopRunWriter extends RunWriter {
  /** output collector of Map task */
  protected OutputCollector<SplitEmittedTerm, MapEmittedPostingList> outputCollector = null;
  /** map task id that is being flushed */
  protected String mapId;
  /** flushNo is the number of times this map task is being flushed */
  protected int flushNo;
  /** The id for this split within the map task that is being flushed */
  protected int splitId;
  
  /** Create a new HadoopRunWriter, specifying the output collector of the map task
   * the run number and the flush number.
   * @param _outputCollector where to emit the posting lists to
   * @param _mapId the task id of the map currently being processed
   * @param _flushNo the number of times that this map task has flushed
   */
  public HadoopRunWriter(OutputCollector<SplitEmittedTerm, MapEmittedPostingList> _outputCollector,
      String _mapId, int _splitId, int _flushNo)
  {
    this.outputCollector = _outputCollector;
    this.mapId = _mapId;
    this.flushNo = _flushNo;
    this.splitId = _splitId;
    this.info = "HadoopRunWriter(Map "+ mapId +", flush "+flushNo+")"; 
  }
  
  @Override
  public void beginWrite(int maxSize, int size) throws IOException
  {}
  
  /** Write the posting to the output collector
   */
  @Override
  public void writeTerm(final String term, final Posting post) throws IOException
  {  
    final MemorySBOS Docs = post.getDocs();
    Docs.pad();
    //get the posting array buffer
    byte[] buffer = new byte[Docs.getMOS().getPos()+1];
    System.arraycopy(Docs.getMOS().getBuffer(), 0, 
        buffer, 0, 
        Math.min(Docs.getMOS().getBuffer().length, Docs.getMOS().getPos()+1));
    
    //emit the term and its posting list
    outputCollector.collect(
        SplitEmittedTerm.createNewTerm(term, splitId, flushNo), 
        MapEmittedPostingList.create_Hadoop_WritableRunPostingData(
            mapId,
            flushNo, 
            splitId,
            buffer,
            post.getDocF(), post.getTF()));
  }
  
  @Override
  public void finishWrite() throws IOException
  {}
  
  /** This RunWriter does not require that the output be sorted.
    */
  @Override
  public boolean writeSorted()
  {
    return false;
  }
}
// Source Code of org.terrier.structures.indexing.singlepass.hadoop.HadoopRunWriter

// Related Classes of org.terrier.structures.indexing.singlepass.hadoop.HadoopRunWriter